# Corona data processing - African Green Monkeys

library(VAM)
library(Seurat)
library(ggplot2)
library(doParallel)
library("GSEABase")
library(data.table)

# File-level settings read by filterData().

# Cells that express fewer genes than this are discarded.
minNumOfGenesExpressed <- 1000

# Minimal expected cluster size, given as a percentage of the cells that remain
# after the cell filter. Genes expressed in no more than that share of cells
# are discarded.
minimalClusterSize <- 10

# Constant subtracted after exponentiating (2^x) when the input is flagged as
# log-transformed.
logScalingConstant <- 1

# Filters a genes-by-cells expression matrix and optionally rescales it to CPM.
#
# Steps, in order:
#   1. If isLogTPM is TRUE, the log transform is undone via
#      2^x - logScalingConstant.
#   2. Cells (columns) with fewer than minNumOfGenesExpressed non-zero genes
#      are removed.
#   3. Genes (rows) that are non-zero in no more than minimalClusterSize percent
#      of the remaining cells are removed.
#   4. If convertToCPM is TRUE, every cell is rescaled so its values sum to 1e6.
#
# Arguments:
#   dataMatrix   - expression matrix with genes in rows and cells in columns.
#   isLogTPM     - TRUE when the values still have to be converted back from
#                  log2 scale.
#   convertToCPM - TRUE to rescale each cell to counts per million.
#
# Returns the filtered (and possibly rescaled) matrix. The result stays
# two-dimensional even when a single cell or a single gene passes the filters.
#
# Reads the file-level settings logScalingConstant, minNumOfGenesExpressed and
# minimalClusterSize.
filterData <- function(dataMatrix, isLogTPM, convertToCPM)
{
  filteredDataMatrix <- dataMatrix
  
  if (isLogTPM == TRUE)
  {
    filteredDataMatrix <- 2^(filteredDataMatrix) - logScalingConstant
  }
  
  # Filtering out cells which express less than the minimal number of genes.
  # drop = FALSE keeps the matrix shape when exactly one cell survives;
  # without it the result collapses to a vector and the apply() below fails.
  expressedGenesCounters <- apply(filteredDataMatrix != 0, 2, sum)
  cellsWithEnoughGenes <- expressedGenesCounters >= minNumOfGenesExpressed
  filteredDataMatrix <- filteredDataMatrix[, cellsWithEnoughGenes, drop = FALSE]
  
  # Filtering out genes which are expressed by less than the minimal expected cluster size of cells
  nonZeroCellCountsForGenes <- apply(filteredDataMatrix != 0, 1, sum)
  totalCellsCount <- ncol(filteredDataMatrix)
  minNumOfCellsInClust <- totalCellsCount * (minimalClusterSize / 100)
  genesWithMinExpression <- (nonZeroCellCountsForGenes > minNumOfCellsInClust)
  filteredDataMatrix <- filteredDataMatrix[genesWithMinExpression, , drop = FALSE]
  
  # Converting the transcript counts to CPM
  if (convertToCPM == TRUE)
  {
    countSumsOfCells <- apply(filteredDataMatrix, 2, sum)
    # After transposing, cells are rows, so the per-cell totals are recycled
    # down the columns and each value is divided by the total of its own cell.
    filteredDataMatrix <- t((t(filteredDataMatrix) / countSumsOfCells) * 1000000)
  }
  
  return (filteredDataMatrix)
}

# Scores every pathway per cell with the VAM package and compares the scores
# between the three monkey groups ("Healthy", "Infected-3Days",
# "Infected-10Days").
#
# The distance.sq component of the VAM result is used as the pathway score.
# For each gene set the function
#   - runs unadjusted pairwise t-tests between the monkey groups,
#   - records the differences between the group medians ("effect sizes"), and
#   - writes a violin/box plot to "<cellType>_VAM_<pathway>_LungCells.pdf".
# Afterwards the unadjusted and the BH-adjusted p-values as well as the median
# differences are written with write.csv2(), and the full score matrix is saved
# as "<cellType>_VAM_All_Pathway_Scores.RDS". All files go to the current
# working directory.
#
# Arguments:
#   pathwaysList - gene sets, handed to createGeneSetCollection() as
#                  gene.set.collection.
#   dataMatrix   - expression matrix with genes in rows and cells in columns.
#                  Cells are assigned to a monkey group by the "AGM<n>_" part
#                  of their column name.
#   cellType     - label used as the prefix of every output file name.
#
# Called for its side effects; returns NULL invisibly.
calculateDifferentialExpression <- function(pathwaysList, dataMatrix, cellType)
{
  healthyLabel <- "Healthy"
  threeDaysLabel <- "Infected-3Days"
  tenDaysLabel <- "Infected-10Days"
  
  # Assigns a monkey group to every cell name: names containing AGM1_/AGM2_ are
  # labelled healthy, AGM3_ to AGM6_ are labelled 3 days, and every other cell
  # falls into the 10 days group.
  classifyCells <- function(cellNames)
  {
    monkeyClass <- rep(tenDaysLabel, length(cellNames))
    monkeyClass[grepl(pattern = "AGM[12]_", x = cellNames)] <- healthyLabel
    monkeyClass[grepl(pattern = "AGM[3-6]_", x = cellNames)] <- threeDaysLabel
    monkeyClass
  }
  
  # Reads one comparison from the pairwise p-value matrix. Yields NA when a
  # group has no cells and is therefore absent from the matrix.
  lookupPValue <- function(pValueMatrix, rowGroup, columnGroup)
  {
    if (rowGroup %in% rownames(pValueMatrix) && columnGroup %in% colnames(pValueMatrix))
    {
      return (pValueMatrix[rowGroup, columnGroup])
    }
    NA_real_
  }
  
  # Writes the unadjusted and the BH-adjusted p-values of one comparison.
  writePValues <- function(pValues, comparisonLabel)
  {
    write.csv2(pValues, file = paste0(cellType, "_VAM_unadjusted_p_values_Lung_", comparisonLabel, ".csv"))
    # p.adjust() corrects for the number of non-missing p-values by default.
    write.csv2(p.adjust(pValues, method = "BH"), file = paste0(cellType, "_VAM_adjusted_p_values_Lung_", comparisonLabel, ".csv"))
  }
  
  genesList <- rownames(dataMatrix)
  genesets_for_currCellType <- createGeneSetCollection(gene.ids = genesList, gene.set.collection = pathwaysList)
  results_for_currCellType <- vamForCollection(gene.expr = t(dataMatrix), gene.set.collection = genesets_for_currCellType)
  # Transposed so that gene sets are rows and cells are columns
  pathwayScores <- t(results_for_currCellType$distance.sq)
  
  saveRDS(pathwayScores, file = paste0(cellType, "_VAM_All_Pathway_Scores.RDS"))
  
  pathwayNames <- rownames(pathwayScores)
  cellNames <- colnames(pathwayScores)
  if (is.null(cellNames))
  {
    cellNames <- as.character(seq_len(ncol(pathwayScores)))
  }
  
  # The group of a cell depends only on its name, so it is computed once
  monkeyClass <- classifyCells(cellNames)
  canCompareGroups <- length(unique(monkeyClass)) > 1
  
  emptyResults <- setNames(rep(NA_real_, length(pathwayNames)), pathwayNames)
  all_P_Values_healthy_vs_3Days <- emptyResults
  all_P_Values_healthy_vs_10Days <- emptyResults
  all_P_Values_10Days_vs_3Days <- emptyResults
  all_effect_differences_healthy_vs_3Days <- emptyResults
  all_effect_differences_healthy_vs_10Days <- emptyResults
  all_effect_differences_3Days_vs_10Days <- emptyResults
  
  for (currGenesetIndex in seq_len(nrow(pathwayScores)))
  {
    pathwayName <- pathwayNames[currGenesetIndex]
    
    scoresAsDataFrame <- data.frame(Score = as.vector(pathwayScores[currGenesetIndex, ]),
                                    cellName = cellNames,
                                    monkeyClass = monkeyClass,
                                    stringsAsFactors = FALSE)
    
    pValueMatrix <- NULL
    if (canCompareGroups)
    {
      pValueMatrix <- pairwise.t.test(scoresAsDataFrame$Score, scoresAsDataFrame$monkeyClass, p.adjust.method = "none")$p.value
    }
    
    healthy_vs_3Days_P_Val <- lookupPValue(pValueMatrix, threeDaysLabel, healthyLabel)
    healthy_vs_10Days_P_Val <- lookupPValue(pValueMatrix, tenDaysLabel, healthyLabel)
    tenDays_vs_3Days_P_Val <- lookupPValue(pValueMatrix, threeDaysLabel, tenDaysLabel)
    
    all_P_Values_healthy_vs_3Days[currGenesetIndex] <- healthy_vs_3Days_P_Val
    all_P_Values_healthy_vs_10Days[currGenesetIndex] <- healthy_vs_10Days_P_Val
    all_P_Values_10Days_vs_3Days[currGenesetIndex] <- tenDays_vs_3Days_P_Val
    
    # Calculating and storing effect size differences (differences of medians)
    Healthy_median <- median(scoresAsDataFrame$Score[monkeyClass == healthyLabel])
    ThreeDays_median <- median(scoresAsDataFrame$Score[monkeyClass == threeDaysLabel])
    TenDays_median <- median(scoresAsDataFrame$Score[monkeyClass == tenDaysLabel])
    
    healthy_vs_3Days_Effect <- Healthy_median - ThreeDays_median
    healthy_vs_10Days_Effect <- Healthy_median - TenDays_median
    threeDays_vs_10Days_Effect <- ThreeDays_median - TenDays_median
    
    all_effect_differences_healthy_vs_3Days[currGenesetIndex] <- healthy_vs_3Days_Effect
    all_effect_differences_healthy_vs_10Days[currGenesetIndex] <- healthy_vs_10Days_Effect
    all_effect_differences_3Days_vs_10Days[currGenesetIndex] <- threeDays_vs_10Days_Effect
    
    # The title reports the p-values themselves, hence "P =" rather than "P <"
    plotTitle <- paste0("T-Test healthy vs. 3-Days (Unadjusted!): P = ", healthy_vs_3Days_P_Val, "\n",
                        "T-Test healthy vs. 10-Days (Unadjusted!): P = ", healthy_vs_10Days_P_Val, "\n",
                        "T-Test 3-Days vs. 10-Days (Unadjusted!): P = ", tenDays_vs_3Days_P_Val, "\n",
                        "Effect size healthy vs. 3-Days: ", healthy_vs_3Days_Effect, "\n",
                        "Effect size healthy vs. 10-Days: ", healthy_vs_10Days_Effect, "\n",
                        "Effect size 3-Days vs. 10-Days: ", threeDays_vs_10Days_Effect)
    
    violinPlot <- ggplot(scoresAsDataFrame, aes(x = monkeyClass, y = Score, fill = monkeyClass)) +
      ggtitle(plotTitle) +
      geom_violin(trim=FALSE) + geom_boxplot(width=0.1)
    
    pdf(paste0(cellType, "_", "VAM_", pathwayName, "_LungCells.pdf"))
    # Close the device even if drawing fails, so no PDF device is left open
    tryCatch(print(violinPlot), finally = dev.off())
  }
  
  writePValues(all_P_Values_healthy_vs_3Days, "healthy_vs_3Days")
  writePValues(all_P_Values_healthy_vs_10Days, "healthy_vs_10Days")
  writePValues(all_P_Values_10Days_vs_3Days, "10Days_vs_3Days")
  
  write.csv2(all_effect_differences_healthy_vs_3Days, file = paste0(cellType, "_VAM_Effect_Sizes_Healthy_vs_3Days.csv"))
  write.csv2(all_effect_differences_healthy_vs_10Days, file = paste0(cellType, "_VAM_Effect_Sizes_Healthy_vs_10Days.csv"))
  write.csv2(all_effect_differences_3Days_vs_10Days, file = paste0(cellType, "_VAM_Effect_Sizes_3Days_vs_10Days.csv"))
  
  invisible(NULL)
}


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MAIN program starts here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# Folder that holds the cell lists, the AGM<n>_Lung 10X folders and the GMT
# file. When left empty the current working directory is used; calling
# setwd("") directly stops the script with "cannot change working directory".
workingDirectory <- ""
if (nzchar(workingDirectory))
{
  setwd(workingDirectory)
}

# Reading the cell names of the different cell types (first column of each file)
# NOTE(review): fread() guesses whether the first line is a header - confirm
# that the first cell name of each list is not consumed as a column name.
alveolarCells <- fread(file = "Epithelial_cells_list.txt", sep = "\n")
BCells <- fread(file = "B_cells_list.txt", sep = "\n")
TCells <- fread(file = "T_cells_list.txt", sep = "\n")

# Reading the 10X data and extracting the relevant RNA counts out of it
Dir_prefix <- "AGM"
Dir_suffix <- seq_len(10)
Data_Directories <- paste0(Dir_prefix, Dir_suffix, "_Lung")
# The directory names double as vector names; presumably Read10X() uses them to
# prefix the cell barcodes, which the AGM<n>_ patterns in
# calculateDifferentialExpression() rely on - TODO confirm.
names(Data_Directories) <- Data_Directories
All_Monkeys_counts <- Read10X(data.dir = Data_Directories)

allAlveolarCells <- All_Monkeys_counts[, alveolarCells[[1]]]
filteredAlveolarCells <- filterData(dataMatrix = allAlveolarCells, isLogTPM = FALSE, convertToCPM = TRUE)

allBCells <- All_Monkeys_counts[, BCells[[1]]]
filteredBCells <- filterData(dataMatrix = allBCells, isLogTPM = FALSE, convertToCPM = TRUE)

allTCells <- All_Monkeys_counts[, TCells[[1]]]
filteredTCells <- filterData(dataMatrix = allTCells, isLogTPM = FALSE, convertToCPM = TRUE)

# Releasing the unfiltered data before the pathway scoring starts
rm(All_Monkeys_counts, alveolarCells, BCells, TCells,
   allAlveolarCells, allBCells, allTCells)
gc()

GMT_FILE_NAME <- "h.all.v7.0.symbols.pluscc.gmt"
GENESET_NAME <- "HALLMARKS"
genesets <- getGmt(GMT_FILE_NAME)
vam_genesets <- geneIds(genesets)

calculateDifferentialExpression(pathwaysList = vam_genesets, dataMatrix = filteredAlveolarCells, cellType = "Epithelial")
calculateDifferentialExpression(pathwaysList = vam_genesets, dataMatrix = filteredBCells, cellType = "B_Cells")
calculateDifferentialExpression(pathwaysList = vam_genesets, dataMatrix = filteredTCells, cellType = "T_Cells")

